*******************************************************************************
* Created: October 2022
* Author: Michael Jensen
* Last Edited: Feb 2023 - MJ
* This file cleans the roster data sent from AISD. 
*The data sent over has an observation for each teacher-student pair with the 
*dates the student entered and exited that class. We want a student observation for each week.
*******************************************************************************



********************* Clean Student Characteristics ****************************
use "${rawdatapath}stud_char.dta", clear
drop dup

*Students who switched school have multiple rows. 
*This is only a problem for our "days_missed" variable. So we sum it by students then merge it back to the original dataset
destring days_missed, replace
collapse (sum) days_missed, by(new_st_id)
rename days_missed days_m
mmerge new_st_id using "${rawdatapath}stud_char.dta"
drop dup

*Delete duplicates
sort new_st_id
quietly by new_st_id:  gen dup = cond(_N==1,0,_n)
drop if dup > 1

*Use the new correct days_missed
drop days_missed
rename days_m days_missed
tostring days_missed, replace
save "${datapath}stud_char_new.dta", replace
********************************************************************************





********************************* Clean classes ********************************
*Imports roster data
use "${rawdatapath}roster_year1.dta", clear

*Merge with treatment status
mmerge new_t_id using "${rawdatapath}teacher_treatment_status.dta"

*Drop teachers who did not have students rostered this year
drop if _merge != 3
drop _merge

*We mark math classes using the course description. If they have "math" or "algebra" in the description they are a math course
gen math_class = 0
gen low_course = lower(course_description)
replace low_course = "temp " + low_course
gen temp = strpos(low_course, "math")
replace temp = strpos(low_course, "algebra") if temp == 0
replace math_class = 1 if temp > 0
drop temp
rename course_catagory course_category
replace course_category = "Math Enrichment" if strpos(low_course, "enrichment") > 0
drop low_course
drop if math_class ==0

*Drop if teachers did not teach all year
*drop if teacher_end_date != "2022-05-26 00:00:00"

*Currently the data has 1 observation for each teacher-student pair, with the dates they entered and exited that class_entry_date
*We want to convert to 1 observation per week per student, with their teacher for that week marked
*Make an observation for each teacher-student pair for each week August - May
expand 44
sort new_st_id new_t_id class_entry_date course_id
quietly by new_st_id new_t_id class_entry_date course_id:  gen week = cond(_N==1,0,_n)

mmerge week using "${rawdatapath}weeks.dta" // this dataset has the dates for each of the 44 weeks of the school year
drop _merge

*clean class entry and exit dates
replace class_entry_date = substr(class_entry_date, 1, 10)
gen week_temp = date(class_entry_date, "YMD")
format week_temp %td
drop class_entry_date
rename week_temp class_entry_date

replace class_exit_date = substr(class_exit_date, 1, 10)
gen week_temp = date(class_exit_date, "YMD")
format week_temp %td
drop class_exit_date
rename week_temp class_exit_date

*We drop obersvations where the student wasn't with that teacher for that week
drop if week_date < class_entry_date
drop if week_date > class_exit_date


*Drop observations where a student had 2 math classes with the same teacher on a given week (double blocked)
sort new_st_id new_t_id week
quietly by new_st_id new_t_id week:  gen doub = cond(_N==1,0,_n)
drop if doub > 1

*Drop observations where students had math classes with 2 different teachers in a given week
*NOTE: 1,185/391,260 week observations with a student and 2 teachers. 101 students were in both a treatment and control classroom (never the same week)
gen duration = class_exit_date - class_entry_date
sort new_st_id week -duration
quietly by new_st_id week:  gen dual = cond(_N==1,0,_n)
drop if dual > 1
********************************************************************************



**************************** KHAN DATA *****************************************
*Merge in ka minutes for those with an id.
destring new_st_id, replace
mmerge new_st_id week using "${datapath}ka_minutes_by_week.dta"
drop if _merge == 2
drop if new_st_id ==.

save "${datapath}students_in_math_class_w_ka_minutes.dta", replace

*Merge in average class practice on a given week and use those for students without practice

mmerge new_t_email week_date using "${datapath}ka_class_averages.dta", uname(avg_)
drop if _merge == 2
drop _merge

gen avg_dummy = 0
replace avg_dummy = 1 if total_minutes ==.

local replacelist total_minutes learning_minutes math_minutes math_learning_minutes mlm_exercise mlm_video mlm_article math_skills_practiced msl_net msl_familiar msl_proficient msl_master msp_earned
foreach v of local replacelist {
	replace `v' = avg_`v' if avg_dummy == 1
}
replace avg_dummy = 0 if total_minutes ==.

*For those missing minutes and without a class average, we give them a practice of 0
gen missing_minutes = 0
replace missing_minutes = 1 if total_minutes ==.

foreach v of local replacelist {
	replace `v' = 0 if missing_minutes == 1
}
save "${datapath}students_minutes_full.dta", replace


*Add in the number of weeks they were in that given class
gen week_duration = 1
collapse (sum) week_duration, by(new_st_id new_t_id)

save "${datapath}weeks_in_class.dta", replace

use "${datapath}students_minutes_full.dta", clear

mmerge new_st_id new_t_id using "${datapath}weeks_in_class.dta"
drop _merge
save "${datapath}students_minutes_full_weeks.dta", replace
********************************************************************************



********************* Final cleaning and merge in test scores ******************
use "${datapath}students_minutes_full_weeks.dta", clear
*drop if week_duration < 9

*We try to pull a grade variable from the course description to potentially drop Grade 2 + Alg 1 (end up not doing this for main spec)
generate str20 s1 = ustrregexs(0) if ustrregexm(course_description,"[0-9,]+")
destring s1, ignore(",") generate(course_grade)
*drop if course_grade < 3
*drop if course_grade ==.
*drop if course_grade == 71
*drop if course_category != "Math"
preserve
use "${rawdatapath}2023_staar_cleaned_20_21.dta", replace
append using "${rawdatapath}2023_staar_cleaned_22_23.dta"
append using "${rawdatapath}2023_staar_cleaned"
quietly bys new_st_id:  gen dup = cond(_N==1,0,_n)
drop if dup > 1
drop dup
save "${rawdatapath}2023_staar_cleaned_22_23.dta", replace
restore




* Add in test scores
mmerge new_st_id using "${rawdatapath}2022_staar_cleaned.dta"
/*
preserve
drop if _merge != 2
save "${datapath}students_no_staar.dta", replace
restore
*/
drop if _merge == 2
drop _merge
mmerge new_st_id using "${rawdatapath}2021_staar_cleaned.dta", uname(_21)
drop if _merge == 2
drop _merge
mmerge new_st_id using "${rawdatapath}2019_staar_cleaned.dta", uname(_19)
drop if _merge == 2
drop _merge
mmerge new_st_id using "${rawdatapath}2018_staar_cleaned.dta", uname(_18)
drop if _merge == 2
drop _merge

mmerge new_st_id using "${rawdatapath}2023_staar_cleaned_22_23.dta", uname(_23)
drop if _merge == 2
drop _merge

*Average all of a student's weekly data down to one observation per student/teacher
collapse (mean) online enrolled_grade grade_tested math_meets_glevel math_on_track math_percentile math_prev_yr_scale_score math_raw_score math_scale_score _21math_scale_score _19math_scale_score _18math_scale_score total_minutes msl_net msl_familiar msl_proficient msl_master week_duration treatment gradeschoolunit _21math_meets_glevel _21grade_tested _19grade_tested _18grade_tested teachergrade read_scale_score _21read_scale_score _21math_percentile _19math_percentile _18math_percentile _21math_raw_score _19math_raw_score _18math_raw_score _19read_scale_score _18read_scale_score _21math_pass math_pass _23*, by(new_st_id new_t_id)

*Drop any duplicates
gen temp = -week_duration
sort new_st_id temp
quietly by new_st_id:  gen dup = cond(_N==1,0,_n)
drop if dup > 1
drop temp

* Merge in student characteristics and add labels
mmerge new_st_id using "${datapath}stud_char_new.dta"
drop if _merge == 2
gen temp = real(age)
drop age
rename temp age
gen female =.
replace female = 0 if per_gender == "M"
replace female = 1 if per_gender == "F"
gen speced =.
replace speced = 0 if spec_ed == "(null)"
replace speced = 1 if spec_ed == "1"
drop spec_ed
gen white = 0
replace white = 1 if student_race == "White"
gen black = 0 
replace black = 1 if student_race == "Black or African American"
gen hispanic = 0
replace hispanic = 1 if ethnicity == "HISPANIC"
gen native = 0
replace native = 1 if student_race == "American Indian or Alaska Native"
gen temp = real(days_missed)
drop days_missed
rename temp days_missed
gen esl_ind = 1
replace esl_ind = 0 if esl == "(null)"
gen freelunch = 0
replace freelunch = 1 if ecodis1 == "Eligible for free meals under the NSLCN Program"

local missingchar female speced white black hispanic native days_missed esl_ind age
foreach var of local missingchar {
	replace `var' =. if per_gender == ""
}

*Grade variables from our original measure of teacher grade
tab teachergrade, gen(grade_)
rename grade_6 grade_8
rename grade_5 grade_7
rename grade_4 grade_6
rename grade_3 grade_5
rename grade_2 grade_4
rename grade_1 grade_3

** Standardize Math Scores **
gen zmss_22 =.
gen zrss_22 =.
gen zmss_21 =.
gen zrss_21 =.
gen zmss_19 =.
gen zrss_19 =.
gen zmss_18 =.
gen zrss_18 =.
gen zmss_23 =.

forvalues i = 3(1)8 {
	qui summ math_scale_score if treatment == 0 & grade_tested == `i'
	replace zmss_22 = (math_scale_score - `r(mean)') / `r(sd)' if grade_tested == `i'
	qui summ read_scale_score if treatment == 0 & grade_tested == `i'
	replace zrss_22 = (read_scale_score - `r(mean)') / `r(sd)' if grade_tested == `i'
	
	qui summ _23math_scale_score if treatment == 0 & _23grade_tested == `i'
	replace zmss_23 = (math_scale_score - `r(mean)') / `r(sd)' if _23grade_tested == `i'

}

forvalues i = 3(1)8 {
	qui summ _21math_scale_score if treatment == 0 & _21grade_tested == `i'
	replace zmss_21 = (_21math_scale_score - `r(mean)') / `r(sd)' if _21grade_tested == `i'
	qui summ _21read_scale_score if treatment == 0 & _21grade_tested == `i'
	replace zrss_21 = (_21read_scale_score - `r(mean)') / `r(sd)' if _21grade_tested == `i'
}

forvalues i = 3(1)6 {
    qui summ _19math_scale_score if treatment == 0 & _19grade_tested == `i'
	replace zmss_19 = (_19math_scale_score - `r(mean)') / `r(sd)' if _19grade_tested == `i'
	qui summ _19read_scale_score if treatment == 0 & _19grade_tested == `i'
	replace zrss_19 = (_19read_scale_score - `r(mean)') / `r(sd)' if _19grade_tested == `i'
}

forvalues i = 3(1)5 {
    qui summ _18math_scale_score if treatment == 0 & _18grade_tested == `i'
	replace zmss_18 = (_18math_scale_score - `r(mean)') / `r(sd)' if _18grade_tested == `i'
	qui summ _18read_scale_score if treatment == 0 & _18grade_tested == `i'
	replace zrss_18 = (_18read_scale_score - `r(mean)') / `r(sd)' if _18grade_tested == `i'
}

*Average class minutes and lu (leave individual out)
gen temp = 1
egen nclassmates = total(temp), by(new_t_id) 
replace nclassmates = nclassmates - temp 		// number of classmates
drop temp
egen totalclassminutes = total(total_minutes), by(new_t_id)
replace totalclassminutes = totalclassminutes - total_minutes
gen classminutes = totalclassminutes/nclassmates

egen totalclasslu = total(msl_net), by(new_t_id)
replace totalclasslu = totalclasslu - msl_net
gen classlu = totalclasslu/nclassmates

drop if online == 1

save "${datapath}student_level_all_data.dta", replace




use "${datapath}student_level_all_data.dta", clear

mmerge new_t_id using "${rawdatapath}kholl_teach.dta"
drop if _merge == 2
drop _merge

*I create dummy variables for all of the grade and demographic cuts I want for looping
gen all = 1
gen grade36 = 0
replace grade36 = 1 if grade_3 == 1 | grade_4 == 1 | grade_5 == 1 | grade_6 == 1
gen grade78 = -grade36 + 1
gen grade3= teachergrade==3
gen grade4= teachergrade==4
gen grade5= teachergrade==5
gen grade6= teachergrade==6
gen grade7= teachergrade==7
gen grade8 = teachergrade==8
gen male = -female +1
gen nonspec = -speced + 1
gen nofreelunch = -freelunch + 1
gen allgrades = all

gen class_minutes1099 = classminutes>=10
gen class_minutes1599 = classminutes>=15
gen class_minutes2099 = classminutes>=20
gen class_minutes2599 = classminutes>=25
gen class_minutes3099 = classminutes>=30
gen class_minutes3599 = classminutes>=35
gen class_minutes4099 = classminutes>=40
gen class_minutes4599 = classminutes>=45
gen class_minutes5099 = classminutes>=50
gen class_minutes5599 = classminutes>=55
gen class_minutes6099 = classminutes>=60

gen class_lu0199 = classlu>=1
gen class_lu0299 = classlu>=2
gen class_lu0399 = classlu>=3
gen class_lu0499 = classlu>=4
gen class_lu0599 = classlu>=5

* Labeling
label variable female 					"Female"
label variable black 					"Race: Black"
label variable white 					"Race: White"
label variable native 					"Race: Native American"
label variable hispanic 				"Ethnicity: Hispanic"
label variable grade_3 					"3rd Grade"
label variable grade_4 					"4th Grade"
label variable grade_5 					"5th Grade"
label variable grade_6 					"6th Grade"
label variable grade_7 					"7th Grade"
label variable grade_8 					"8th Grade"
label variable speced 					"Special Ed"
label variable days_missed 				"Days Missed"
label variable esl_ind 					"ESL"
label variable week_duration 			"Weeks with Teacher"
label variable _21math_scale_score 		"2021 STAAR Math Scale Score"
label variable _19math_scale_score 		"2019 STAAR Math Scale Score"
label variable _18math_scale_score 		"2018 STAAR Math Scale Score"
label variable age 						"Age"
label variable native 					"Race: Native American"
label variable speced 					"Special Ed students"
label variable freelunch 				"Free lunch"

label variable all 						"All students"
label variable male 					"Male"
label variable nonspec 					"Non-Special Ed"
label variable freelunch 				"Free lunch Eligible"
label variable nofreelunch 				"Non-free lunch"
label variable allgrades 				"All Grades"
label variable grade36 					"Grades 3-6"
label variable grade78 					"Grades 7-8"
label variable grade3 					"Grade 3"
label variable grade4 					"Grade 4"
label variable grade5 					"Grade 5"
label variable grade6 					"Grade 6"
label variable grade7 					"Grade 7"
label variable grade8 					"Grade 8"

save "${datapath}student_level_all_data.dta", replace